Pandas Patch In Action

import packages and download test data


In [27]:
from pandas_patch import *

In [30]:
%psource structure

In [11]:
def get_test_df_complete():
    """ get the full test dataset from Lending Club open source database,
    the purpose of this fuction is to be used in a demo ipython notebook """
    import requests
    from zipfile import ZipFile
    from StringIO import StringIO
    zip_to_download = "https://resources.lendingclub.com/LoanStats3b.csv.zip"
    r = requests.get(zip_to_download)
    zipfile = ZipFile(StringIO(r.content))
    file_csv = zipfile.namelist()[0]
    df = pd.read_csv(zipfile.open(file_csv), skiprows =[0], na_values = ['n/a','N/A',''],
     parse_dates = ['issue_d','last_pymnt_d','next_pymnt_d','last_credit_pull_d'] )
    zipfile.close()
    df = df[:-2]
    nb_row = float(len(df.index))
    df['na_col'] = np.nan
    df['constant_col'] = 'constant'
    df['duplicated_column'] = df.id
    df['many_missing_70'] = np.nan
    df.loc[1:int(0.3*nb_row),'many_missing_70'] = 1
    df['bad'] = 1
    index_good = df['loan_status'].isin(['Fully Paid', 'Current','In Grace Period'])
    df.loc[index_good,'bad'] = 0
    return df

In [12]:
# ipython tips
# with psource you can see the source code of a function
%psource pandas_patch

?nacount # to get info about the functions and docs

In [13]:
#df = get_test_df_complete()

# because no wifi connection 
df = get_test_df_complete()


/Users/efourrier/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py:1164: DtypeWarning: Columns (0) have mixed types. Specify dtype option on import or set low_memory=False.
  data = self._reader.read(nrows)

In [19]:
df.columns


Out[19]:
Index([u'id', u'member_id', u'loan_amnt', u'funded_amnt', u'funded_amnt_inv', u'term', u'int_rate', u'installment', u'grade', u'sub_grade', u'emp_title', u'emp_length', u'home_ownership', u'annual_inc', u'is_inc_v', u'issue_d', u'loan_status', u'pymnt_plan', u'url', u'desc', u'purpose', u'title', u'zip_code', u'addr_state', u'dti', u'delinq_2yrs', u'earliest_cr_line', u'inq_last_6mths', u'mths_since_last_delinq', u'mths_since_last_record', u'open_acc', u'pub_rec', u'revol_bal', u'revol_util', u'total_acc', u'initial_list_status', u'out_prncp', u'out_prncp_inv', u'total_pymnt', u'total_pymnt_inv', u'total_rec_prncp', u'total_rec_int', u'total_rec_late_fee', u'recoveries', u'collection_recovery_fee', u'last_pymnt_d', u'last_pymnt_amnt', u'next_pymnt_d', u'last_credit_pull_d', u'collections_12_mths_ex_med', u'mths_since_last_major_derog', u'policy_code', u'na_col', u'constant_col', u'duplicated_column', u'many_missing_70', u'bad'], dtype='object')

Basic data cleaning and exploration

Basic helpers


In [20]:
df.nrow()


Out[20]:
188123

In [21]:
df.ncol()


Out[21]:
57

In [22]:
df.dfnum() #identify numeric variables


Out[22]:
['member_id',
 'loan_amnt',
 'funded_amnt',
 'funded_amnt_inv',
 'installment',
 'annual_inc',
 'dti',
 'delinq_2yrs',
 'inq_last_6mths',
 'mths_since_last_delinq',
 'mths_since_last_record',
 'open_acc',
 'pub_rec',
 'revol_bal',
 'total_acc',
 'out_prncp',
 'out_prncp_inv',
 'total_pymnt',
 'total_pymnt_inv',
 'total_rec_prncp',
 'total_rec_int',
 'total_rec_late_fee',
 'recoveries',
 'collection_recovery_fee',
 'last_pymnt_amnt',
 'collections_12_mths_ex_med',
 'mths_since_last_major_derog',
 'policy_code',
 'na_col',
 'many_missing_70',
 'bad']

In [23]:
df.dfchar() # identify character variables


Out[23]:
['id',
 'term',
 'int_rate',
 'grade',
 'sub_grade',
 'emp_title',
 'emp_length',
 'home_ownership',
 'is_inc_v',
 'loan_status',
 'pymnt_plan',
 'url',
 'desc',
 'purpose',
 'title',
 'zip_code',
 'addr_state',
 'earliest_cr_line',
 'revol_util',
 'initial_list_status',
 'constant_col',
 'duplicated_column']

In [24]:
timeit df.factors()


1 loops, best of 3: 1.98 s per loop

In [118]:
df.nacount(axis = 0) # counting the number of missing values per column


Out[118]:
Nanumber Napercentage
id 0 0.000000
member_id 0 0.000000
loan_amnt 0 0.000000
funded_amnt 0 0.000000
funded_amnt_inv 0 0.000000
term 0 0.000000
int_rate 0 0.000000
installment 0 0.000000
grade 0 0.000000
sub_grade 0 0.000000
emp_title 17734 0.059915
emp_length 0 0.000000
home_ownership 0 0.000000
annual_inc 0 0.000000
is_inc_v 0 0.000000
issue_d 0 0.000000
loan_status 0 0.000000
pymnt_plan 0 0.000000
url 0 0.000000
desc 232074 0.784068
purpose 0 0.000000
title 5 0.000017
zip_code 0 0.000000
addr_state 0 0.000000
dti 0 0.000000
delinq_2yrs 0 0.000000
earliest_cr_line 0 0.000000
fico_range_low 0 0.000000
fico_range_high 0 0.000000
inq_last_6mths 0 0.000000
mths_since_last_delinq 156366 0.528287
mths_since_last_record 251069 0.848243
open_acc 0 0.000000
pub_rec 0 0.000000
revol_bal 0 0.000000
revol_util 162 0.000547
total_acc 0 0.000000
initial_list_status 0 0.000000
out_prncp 0 0.000000
out_prncp_inv 0 0.000000
total_pymnt 0 0.000000
total_pymnt_inv 0 0.000000
total_rec_prncp 0 0.000000
total_rec_int 0 0.000000
total_rec_late_fee 0 0.000000
recoveries 0 0.000000
collection_recovery_fee 0 0.000000
last_pymnt_d 186 0.000628
last_pymnt_amnt 0 0.000000
next_pymnt_d 38805 0.131104
last_credit_pull_d 25 0.000084
last_fico_range_high 0 0.000000
last_fico_range_low 0 0.000000
collections_12_mths_ex_med 0 0.000000
mths_since_last_major_derog 223698 0.755770
policy_code 0 0.000000

In [119]:
df.nacount(axis = 1) # count the number of missing values per rows


Out[119]:
Nanumber Napercentage
0 4 0.071429
1 4 0.071429
2 2 0.035714
3 1 0.017857
4 2 0.035714
5 3 0.053571
6 2 0.035714
7 3 0.053571
8 4 0.071429
9 4 0.071429
10 2 0.035714
11 2 0.035714
12 4 0.071429
13 3 0.053571
14 3 0.053571
15 4 0.071429
16 2 0.035714
17 4 0.071429
18 4 0.071429
19 2 0.035714
20 2 0.035714
21 2 0.035714
22 4 0.071429
23 4 0.071429
24 2 0.035714
25 2 0.035714
26 2 0.035714
27 3 0.053571
28 4 0.071429
29 4 0.071429
... ... ...
295957 3 0.053571
295958 3 0.053571
295959 1 0.017857
295960 2 0.035714
295961 2 0.035714
295962 3 0.053571
295963 3 0.053571
295964 5 0.089286
295965 1 0.017857
295966 3 0.053571
295967 3 0.053571
295968 4 0.071429
295969 3 0.053571
295970 3 0.053571
295971 4 0.071429
295972 4 0.071429
295973 2 0.035714
295974 4 0.071429
295975 4 0.071429
295976 1 0.017857
295977 4 0.071429
295978 2 0.035714
295979 2 0.035714
295980 2 0.035714
295981 4 0.071429
295982 3 0.053571
295983 3 0.053571
295984 2 0.035714
295985 2 0.035714
295986 3 0.053571

295987 rows × 2 columns


In [96]:
df.constantcol() # find the constant columns


Out[96]:
['policy_code']

In [97]:
df.findupcol() # find the duplicate columns


Out[97]:
[]

In [138]:
timeit df.detectkey(pct = )


1 loops, best of 3: 1.37 s per loop

In [37]:
timeit df.apply(lambda x: len(pd.unique(x)))


1 loops, best of 3: 7.66 s per loop

In [38]:
timeit df.count_unique()


1 loops, best of 3: 7.77 s per loop

In [ ]:


In [20]:
df.manymissing(a = 0.7)


Out[20]:
Index([u'desc', u'mths_since_last_record', u'mths_since_last_major_derog'], dtype='object')

Summary of strucuture, data info and cleaning functions


In [31]:
timeit df.structure()


1 loops, best of 3: 11.1 s per loop

In [22]:
df.psummary(dynamic = True)


there are 0 duplicated rows

the columns with more than 70.00% manymissing values:
['desc', 'mths_since_last_record', 'mths_since_last_major_derog'] 

the columns with less than 5.00% manymissing values are :
['title', 'revol_util', 'last_pymnt_d', 'last_credit_pull_d'] 
 you should fill them with median or most common value 

the detected keys of the dataset are:
['id', 'member_id', 'url'] 

the duplicated columns of the dataset are:
[]

the constant columns of the dataset are:
['policy_code']

the columns with nearzerovariance are:
['pymnt_plan', 'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'next_pymnt_d', 'last_credit_pull_d', 'collections_12_mths_ex_med']

0.999999954602
0.9999995703
0.999997894256
0.999997750074
0.999995699105
0.990938422868
0.967614558757
0.951902875674
0.875725744027
the columns highly correlated to others to remove are:
['fico_range_low', 'funded_amnt', 'total_pymnt_inv', 'out_prncp_inv', 'funded_amnt_inv', 'id', 'total_pymnt', 'loan_amnt']

these columns contains big strings :
['emp_title', 'url', 'desc']


In [27]:
df.str


Out[27]:
id member_id loan_amnt funded_amnt funded_amnt_inv term int_rate installment grade sub_grade ... collection_recovery_fee last_pymnt_d last_pymnt_amnt next_pymnt_d last_credit_pull_d last_fico_range_high last_fico_range_low collections_12_mths_ex_med mths_since_last_major_derog policy_code
0 29003494 31526675 28000 28000 28000 36 months 0.071 866.10 A A3 ... 0 14-Nov 866.10 14-Dec 14-Nov 719 715 0 NaN 1
1 28913233 31436395 3000 3000 3000 36 months 0.117 99.18 B B4 ... 0 14-Oct 99.18 14-Dec 14-Nov 689 685 0 NaN 1
2 27592013 30095159 18000 18000 18000 36 months 0.140 615.03 C C3 ... 0 14-Nov 615.03 14-Dec 14-Nov 699 695 0 27 1
3 28172444 30685592 15000 15000 15000 60 months 0.202 399.08 E E3 ... 0 14-Nov 399.08 14-Dec 14-Nov 664 660 0 70 1
4 20349337 22622030 23275 23275 23275 60 months 0.176 585.61 D D4 ... 0 14-Nov 585.61 15-Jan 14-Nov 709 705 0 57 1
5 27502482 30005666 21000 21000 21000 60 months 0.140 488.42 C C3 ... 0 14-Nov 488.42 14-Dec 14-Nov 744 740 0 NaN 1
6 27702335 30215519 7800 7800 7800 36 months 0.110 255.33 B B3 ... 0 14-Nov 255.33 14-Dec 14-Nov 674 670 0 43 1
7 28032412 30545550 13625 13625 13625 36 months 0.163 480.97 D D2 ... 0 14-Nov 480.97 14-Dec 14-Nov 719 715 0 NaN 1
8 28032523 30545669 17225 17225 17225 60 months 0.140 400.62 C C3 ... 0 14-Nov 400.62 14-Dec 14-Nov 759 755 0 NaN 1
9 28042338 30555469 8150 8150 8150 36 months 0.092 259.82 B B1 ... 0 14-Nov 259.82 14-Dec 14-Nov 769 765 0 NaN 1
10 28052496 30565639 2000 2000 2000 36 months 0.092 63.76 B B1 ... 0 14-Nov 63.76 14-Dec 14-Nov 709 705 0 25 1
11 28062557 30575706 25000 25000 25000 60 months 0.145 588.08 C C4 ... 0 14-Nov 588.08 14-Dec 14-Nov 709 705 0 69 1
12 28072219 30585351 20000 20000 20000 60 months 0.130 454.96 C C1 ... 0 14-Nov 454.96 14-Dec 14-Nov 749 745 0 NaN 1
13 28072334 30585477 5000 5000 5000 36 months 0.110 163.67 B B3 ... 0 14-Nov 163.67 14-Dec 14-Nov 754 750 0 NaN 1
14 28102226 30615365 9000 9000 9000 36 months 0.125 301.04 B B5 ... 0 14-Nov 301.04 14-Dec 14-Nov 704 700 0 NaN 1
15 28102242 30615381 8000 8000 8000 36 months 0.163 282.41 D D2 ... 0 14-Nov 282.41 14-Dec 14-Nov 709 705 0 NaN 1
16 28112391 30625533 18625 18625 18625 60 months 0.163 455.80 D D2 ... 0 14-Nov 455.80 14-Dec 14-Nov 739 735 0 30 1
17 28122281 30635409 6825 6825 6825 36 months 0.150 236.56 C C5 ... 0 14-Nov 236.56 14-Dec 14-Nov 684 680 0 NaN 1
18 28122316 30635450 7000 7000 7000 36 months 0.145 240.92 C C4 ... 0 14-Nov 240.92 14-Dec 14-Nov 669 665 0 NaN 1
19 28122319 30635453 15000 15000 15000 60 months 0.163 367.09 D D2 ... 0 14-Nov 367.09 14-Dec 14-Nov 724 720 0 46 1
20 28132525 30645679 8000 8000 8000 36 months 0.125 267.60 B B5 ... 0 14-Nov 267.60 14-Dec 14-Nov 719 715 0 16 1
21 28192471 30705604 20000 20000 20000 60 months 0.150 475.70 C C5 ... 0 14-Nov 475.70 14-Dec 14-Nov 714 710 0 41 1
22 28202142 30715268 6075 6075 6075 36 months 0.182 220.36 D D5 ... 0 14-Nov 220.36 14-Dec 14-Nov 689 685 0 NaN 1
23 28212255 30725390 20000 20000 20000 60 months 0.110 434.75 B B3 ... 0 14-Nov 434.75 14-Dec 14-Nov 714 710 0 NaN 1
24 27611626 30114751 6400 6400 6400 36 months 0.170 228.15 D D3 ... 0 14-Nov 228.15 14-Dec 14-Nov 669 665 0 NaN 1
25 28032263 30545390 6075 6075 6075 36 months 0.150 210.57 C C5 ... 0 14-Nov 210.57 14-Dec 14-Nov 674 670 0 74 1
26 28042070 30555187 16000 16000 16000 60 months 0.134 366.93 C C2 ... 0 14-Nov 366.93 14-Dec 14-Nov 699 695 0 62 1
27 28042197 30555320 20000 20000 20000 36 months 0.117 661.14 B B4 ... 0 14-Nov 661.14 14-Dec 14-Nov 674 670 0 NaN 1
28 28052043 30565161 21000 21000 20900 36 months 0.102 679.10 B B2 ... 0 14-Nov 679.10 14-Dec 14-Nov 754 750 0 NaN 1
29 28052390 30565524 4400 4400 4400 36 months 0.163 155.33 D D2 ... 0 14-Nov 155.33 14-Dec 14-Nov 764 760 0 NaN 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
295957 2381629 2844317 10000 10000 10000 36 months 0.141 342.22 B B5 ... 0 14-Sep 5307.47 NaN 14-Sep 704 700 0 25 1
295958 2371607 2834360 28000 28000 28000 36 months 0.089 889.09 A A5 ... 0 14-Nov 889.09 14-Dec 14-Nov 734 730 0 NaN 1
295959 2381552 2844239 13975 13975 13975 36 months 0.178 503.62 D D1 ... 0 14-Oct 503.62 14-Dec 14-Nov 554 550 0 25 1
295960 2371355 2834090 10575 10575 10575 36 months 0.153 368.20 C C2 ... 0 14-Nov 368.20 14-Dec 14-Nov 589 585 0 NaN 1
295961 2381291 2843953 10000 10000 10000 36 months 0.102 323.43 B B1 ... 0 14-Nov 323.43 14-Dec 14-Nov 714 710 0 80 1
295962 1867268 2169638 24600 24600 24575 60 months 0.102 524.62 B B1 ... 0 14-Nov 524.62 14-Dec 14-Nov 759 755 0 NaN 1
295963 2380286 2842889 8500 8500 8500 36 months 0.060 258.71 A A1 ... 0 14-Nov 258.71 14-Dec 14-Nov 814 810 0 NaN 1
295964 2369570 2832161 16000 16000 16000 60 months 0.089 331.36 A A5 ... 0 14-Jul 174.92 NaN 14-Nov 794 790 0 NaN 1
295965 2380722 2843349 20000 20000 20000 60 months 0.131 456.19 B B4 ... 0 14-Nov 456.19 14-Dec 14-Nov 744 740 0 31 1
295966 2380728 2843355 18225 18225 18225 36 months 0.089 578.71 A A5 ... 0 14-Nov 578.71 14-Dec 14-Nov 729 725 0 NaN 1
295967 2370680 2833360 6000 6000 6000 36 months 0.102 194.06 B B1 ... 0 14-May 194.06 NaN 14-Oct 529 525 0 NaN 1
295968 2380321 2842924 16750 16750 16750 36 months 0.102 541.74 B B1 ... 0 14-Nov 541.74 14-Dec 14-Nov 714 710 0 NaN 1
295969 2370278 2832933 12000 12000 12000 36 months 0.102 388.11 B B1 ... 0 14-Nov 388.11 14-Dec 14-Nov 729 725 0 NaN 1
295970 2366346 2828664 19750 19750 19750 60 months 0.230 556.20 F F1 ... 0 14-Nov 15488.41 NaN 14-Nov 714 710 0 NaN 1
295971 2308648 2741109 20000 20000 20000 36 months 0.089 635.07 A A5 ... 0 14-Nov 635.07 14-Dec 14-Nov 724 720 0 NaN 1
295972 2379355 2841866 6000 6000 6000 36 months 0.205 224.49 E E1 ... 0 14-Aug 3517.06 NaN 14-Aug 684 680 0 NaN 1
295973 2301746 2734342 15000 15000 15000 36 months 0.141 513.33 B B5 ... 0 14-Nov 513.33 14-Dec 14-Nov 679 675 0 49 1
295974 2377558 2839922 18775 18775 18775 36 months 0.102 607.23 B B1 ... 0 14-Nov 607.23 14-Dec 14-Nov 714 710 0 NaN 1
295975 2377252 2839601 15000 15000 15000 36 months 0.111 492.08 B B2 ... 0 14-Nov 492.08 14-Dec 14-Nov 614 610 0 NaN 1
295976 2366519 2828844 35000 35000 34900 60 months 0.205 936.86 E E1 ... 0 14-Nov 936.86 14-Dec 14-Nov 699 695 0 34 1
295977 2367122 2829498 24000 24000 24000 36 months 0.066 736.89 A A2 ... 0 14-Nov 736.89 14-Dec 14-Nov 699 695 0 NaN 1
295978 2298828 2731287 23000 23000 23000 36 months 0.141 787.10 B B5 ... 0 14-Nov 787.10 14-Dec 14-Nov 699 695 0 NaN 1
295979 2375433 2837641 25000 25000 24950 36 months 0.131 843.68 B B4 ... 0 14-Nov 843.68 14-Dec 14-Nov 694 690 0 52 1
295980 2365716 2827951 21000 21000 21000 36 months 0.089 666.82 A A5 ... 0 14-Nov 666.82 14-Dec 14-Nov 734 730 0 6 1
295981 2375143 2837335 11500 11500 11450 36 months 0.111 377.26 B B2 ... 0 14-Nov 377.26 14-Dec 14-Nov 694 690 0 NaN 1
295982 2334898 2777049 16000 16000 15750 60 months 0.102 341.22 B B1 ... 0 14-Nov 341.22 14-Dec 14-Nov 729 725 0 NaN 1
295983 2375068 2837252 18000 18000 17950 36 months 0.188 657.54 D D3 ... 0 14-Nov 657.54 14-Dec 14-Nov 644 640 0 NaN 1
295984 2374791 2836963 17000 17000 17000 36 months 0.121 565.62 B B3 ... 0 14-Nov 565.62 14-Dec 14-Nov 674 670 0 22 1
295985 2301035 2733598 31500 31500 31500 36 months 0.121 1048.06 B B3 ... 0 14-Nov 1048.06 14-Dec 14-Nov 724 720 0 45 1
295986 2300581 2733123 28000 28000 27750 36 months 0.079 876.13 A A4 ... 0 14-Nov 876.13 14-Dec 14-Nov 754 750 0 NaN 1

295987 rows × 56 columns


In [48]:
%psource structure

In [49]:
df.nacount(axis = 0).Napercentage


Out[49]:
id                             0.000000
member_id                      0.000000
loan_amnt                      0.000000
funded_amnt                    0.000000
funded_amnt_inv                0.000000
term                           0.000000
int_rate                       0.000000
installment                    0.000000
grade                          0.000000
sub_grade                      0.000000
emp_title                      0.059915
emp_length                     0.000000
home_ownership                 0.000000
annual_inc                     0.000000
is_inc_v                       0.000000
issue_d                        0.000000
loan_status                    0.000000
pymnt_plan                     0.000000
url                            0.000000
desc                           0.784068
purpose                        0.000000
title                          0.000017
zip_code                       0.000000
addr_state                     0.000000
dti                            0.000000
delinq_2yrs                    0.000000
earliest_cr_line               0.000000
fico_range_low                 0.000000
fico_range_high                0.000000
inq_last_6mths                 0.000000
mths_since_last_delinq         0.528287
mths_since_last_record         0.848243
open_acc                       0.000000
pub_rec                        0.000000
revol_bal                      0.000000
revol_util                     0.000547
total_acc                      0.000000
initial_list_status            0.000000
out_prncp                      0.000000
out_prncp_inv                  0.000000
total_pymnt                    0.000000
total_pymnt_inv                0.000000
total_rec_prncp                0.000000
total_rec_int                  0.000000
total_rec_late_fee             0.000000
recoveries                     0.000000
collection_recovery_fee        0.000000
last_pymnt_d                   0.000628
last_pymnt_amnt                0.000000
next_pymnt_d                   0.131104
last_credit_pull_d             0.000084
last_fico_range_high           0.000000
last_fico_range_low            0.000000
collections_12_mths_ex_med     0.000000
mths_since_last_major_derog    0.755770
policy_code                    0.000000
Name: Napercentage, Length: 56, dtype: float64

In [32]:
%timeit df.nacount()


1 loops, best of 3: 239 ms per loop

In [34]:
timeit df.count_unique()


1 loops, best of 3: 7.47 s per loop

In [67]:
df.count()


Out[67]:
id                             295987
member_id                      295987
loan_amnt                      295987
funded_amnt                    295987
funded_amnt_inv                295987
term                           295987
int_rate                       295987
installment                    295987
grade                          295987
sub_grade                      295987
emp_title                      278253
emp_length                     295987
home_ownership                 295987
annual_inc                     295987
is_inc_v                       295987
issue_d                        295987
loan_status                    295987
pymnt_plan                     295987
url                            295987
desc                            63913
purpose                        295987
title                          295982
zip_code                       295987
addr_state                     295987
dti                            295987
delinq_2yrs                    295987
earliest_cr_line               295987
fico_range_low                 295987
fico_range_high                295987
inq_last_6mths                 295987
mths_since_last_delinq         139621
mths_since_last_record          44918
open_acc                       295987
pub_rec                        295987
revol_bal                      295987
revol_util                     295825
total_acc                      295987
initial_list_status            295987
out_prncp                      295987
out_prncp_inv                  295987
total_pymnt                    295987
total_pymnt_inv                295987
total_rec_prncp                295987
total_rec_int                  295987
total_rec_late_fee             295987
recoveries                     295987
collection_recovery_fee        295987
last_pymnt_d                   295801
last_pymnt_amnt                295987
next_pymnt_d                   257182
last_credit_pull_d             295962
last_fico_range_high           295987
last_fico_range_low            295987
collections_12_mths_ex_med     295987
mths_since_last_major_derog     72289
policy_code                    295987
Length: 56, dtype: int64

In [68]:
df.nacount()


Out[68]:
Nanumber Napercentage
id 0 0.000000
member_id 0 0.000000
loan_amnt 0 0.000000
funded_amnt 0 0.000000
funded_amnt_inv 0 0.000000
term 0 0.000000
int_rate 0 0.000000
installment 0 0.000000
grade 0 0.000000
sub_grade 0 0.000000
emp_title 17734 0.059915
emp_length 0 0.000000
home_ownership 0 0.000000
annual_inc 0 0.000000
is_inc_v 0 0.000000
issue_d 0 0.000000
loan_status 0 0.000000
pymnt_plan 0 0.000000
url 0 0.000000
desc 232074 0.784068
purpose 0 0.000000
title 5 0.000017
zip_code 0 0.000000
addr_state 0 0.000000
dti 0 0.000000
delinq_2yrs 0 0.000000
earliest_cr_line 0 0.000000
fico_range_low 0 0.000000
fico_range_high 0 0.000000
inq_last_6mths 0 0.000000
mths_since_last_delinq 156366 0.528287
mths_since_last_record 251069 0.848243
open_acc 0 0.000000
pub_rec 0 0.000000
revol_bal 0 0.000000
revol_util 162 0.000547
total_acc 0 0.000000
initial_list_status 0 0.000000
out_prncp 0 0.000000
out_prncp_inv 0 0.000000
total_pymnt 0 0.000000
total_pymnt_inv 0 0.000000
total_rec_prncp 0 0.000000
total_rec_int 0 0.000000
total_rec_late_fee 0 0.000000
recoveries 0 0.000000
collection_recovery_fee 0 0.000000
last_pymnt_d 186 0.000628
last_pymnt_amnt 0 0.000000
next_pymnt_d 38805 0.131104
last_credit_pull_d 25 0.000084
last_fico_range_high 0 0.000000
last_fico_range_low 0 0.000000
collections_12_mths_ex_med 0 0.000000
mths_since_last_major_derog 223698 0.755770
policy_code 0 0.000000

In [79]:
1 >= 2


Out[79]:
False

In [80]:
df.int_rate.dtype


Out[80]:
dtype('float64')

In [103]:


In [127]:
df.sample_df(pct = 0.10).nrow()


Out[127]:
29598

In [10]:
df.factors()


1 loops, best of 3: 804 ms per loop

In [15]:
timeit df.detectkey(pct = 0.05)


1 loops, best of 3: 642 ms per loop

In [16]:
timeit df.detectkey2()


1 loops, best of 3: 3.37 s per loop

In [18]:
df.nearzerovar()


                               freq_ratio    nzv  percent_unique zero_var
id                               1.000000  False      100.000000    False
member_id                        1.000000  False      100.000000    False
loan_amnt                        1.319625  False        0.446303    False
funded_amnt                      1.319625  False        0.446303    False
funded_amnt_inv                  1.350674  False        0.457115    False
term                             2.556126  False        0.000676    False
int_rate                         1.049433  False        0.033785    False
installment                      1.211738  False       13.732698    False
grade                            1.042449  False        0.002365    False
sub_grade                        1.019850  False        0.011825    False
emp_title                        1.253420  False       44.487765    False
emp_length                       3.964490  False        0.004054    False
home_ownership                   1.358463  False        0.001351    False
annual_inc                       1.116673  False        7.601347    False
is_inc_v                         1.177140  False        0.001014    False
issue_d                          1.534426  False        0.007095    False
loan_status                      7.741702  False        0.002365    False
pymnt_plan                    3362.488636   True        0.000676    False
url                              1.000000  False      100.000000    False
desc                             1.100000  False       21.210053    False
purpose                          2.482576  False        0.004392    False
title                            2.558264  False       11.264684    False
zip_code                         1.028125  False        0.292243    False
addr_state                       1.795696  False        0.016893    False
dti                              1.145299  False        1.321680    False
delinq_2yrs                      6.479383  False        0.007771    False
earliest_cr_line                 1.011780  False        0.219604    False
fico_range_low                   1.025028  False        0.012838    False
fico_range_high                  1.025028  False        0.012838    False
inq_last_6mths                   1.814355  False        0.002365    False
mths_since_last_delinq           1.009972  False        0.047637    False
mths_since_last_record           1.012972  False        0.041218    False
open_acc                         1.021711  False        0.020271    False
pub_rec                          6.443615  False        0.007095    False
revol_bal                       21.151515  False       17.034870    False
revol_util                       1.272727  False        0.385490    False
total_acc                        1.001182  False        0.034461    False
initial_list_status              1.532769  False        0.000676    False
out_prncp                      337.547826  False       49.844419    False
out_prncp_inv                  340.508772  False       52.214793    False
total_pymnt                      1.447674  False       57.532256    False
total_pymnt_inv                  1.364198  False       59.522547    False
total_rec_prncp                  1.229989  False       48.328136    False
total_rec_int                    1.512195  False       49.332910    False
total_rec_late_fee             856.357558   True        0.307446    False
recoveries                   58780.200000   True        0.688206    False
collection_recovery_fee      98034.666667   True        0.630095    False
last_pymnt_d                     6.877148  False        0.007771    False
last_pymnt_amnt                  1.228931  False       23.844290    False
next_pymnt_d                    40.179052   True        0.001689    False
last_credit_pull_d              59.006795   True        0.008784    False
last_fico_range_high             1.028124  False        0.024325    False
last_fico_range_low              1.028124  False        0.023988    False
collections_12_mths_ex_med     117.000399   True        0.002365    False
mths_since_last_major_derog      1.013688  False        0.054732    False
policy_code                      1.000000  False        0.000338     True
Out[18]:
Index([u'pymnt_plan', u'total_rec_late_fee', u'recoveries', u'collection_recovery_fee', u'next_pymnt_d', u'last_credit_pull_d', u'collections_12_mths_ex_med'], dtype='object')

In [39]:
def pandas_to_ndarray_wrap(X, copy=True):
    """
    Converts X to a ndarray and provides a function to help convert back
    to pandas object.
    Parameters
    ----------
    X : Series/DataFrame/ndarray
    copy : Boolean
        If True, return a copy.
    Returns
    -------
    Xvals : ndarray
        If X is a Series/DataFrame, then Xvals = X.values,
        if ndarray, Xvals = X
    F : Function
        F(Xvals) = X
    """
    if copy:
        X = X.copy()

    if isinstance(X, pd.Series):
        return X.values, lambda Z: pd.Series(np.squeeze(Z), index=X.index)
    elif isinstance(X, pd.DataFrame):
        return X.values, lambda Z: pd.DataFrame(
            Z, index=X.index, columns=X.columns)
    elif isinstance(X, np.ndarray) or isspmatrix(X):
        return X, lambda Z: Z
    else:
        raise ValueError("Unhandled type: %s" % type(X))

In [40]:
pandas_to_ndarray_wrap(df)


Out[40]:
(array([[10159548, 12011167.0, 15000.0, ..., 10159548, nan, 0],
        [10149488, 12001033.0, 4800.0, ..., 10149488, 1.0, 0],
        [10129403, 11981032.0, 7550.0, ..., 10129403, 1.0, 0],
        ..., 
        ['1059224', 1290827.0, 35000.0, ..., '1059224', nan, 0],
        ['1058722', 1290521.0, 12000.0, ..., '1058722', nan, 1],
        ['1058291', 1289878.0, 12000.0, ..., '1058291', nan, 0]], dtype=object),
 <function __main__.<lambda>>)

In [41]:
pandas_to_ndarray_wrap(df)[0]


Out[41]:
array([[10159548, 12011167.0, 15000.0, ..., 10159548, nan, 0],
       [10149488, 12001033.0, 4800.0, ..., 10149488, 1.0, 0],
       [10129403, 11981032.0, 7550.0, ..., 10129403, 1.0, 0],
       ..., 
       ['1059224', 1290827.0, 35000.0, ..., '1059224', nan, 0],
       ['1058722', 1290521.0, 12000.0, ..., '1058722', nan, 1],
       ['1058291', 1289878.0, 12000.0, ..., '1058291', nan, 0]], dtype=object)

In [44]:
df.size


Out[44]:
10723011

In [45]:
timeit df.duplicated()


1 loops, best of 3: 1.08 s per loop

In [ ]: